Data Wrangling COVID-19 Mini-Challenge

Imports/Setup

1 import pandas as pd
2 import wikidata_plain_sparql as wikidata
3
4 from bokeh.palettes import turbo
5 from bokeh.plotting import figure, output_notebook, show, gridplot
6
7 # set bokeh output mode to notebook
8 output_notebook()
9
10 from helper import get_jhu_cached, create_grid

Loading BokehJS ...

Worldwide COVID-19 Data

1 # get all dates from earliest available data until yesterday
2 all_dates = pd.date_range(start='2020-01-22', end='today')[:-1]
3
4 # standardize column names for all entries
5 def rename_columns(column):
6 column_map = {
7 'Lat': 'Latitude',
8 'Long_': 'Longitude',
9 'Incidence_Rate': 'Incident_Rate'
10 }
11 if column in column_map:
12 return column_map[column]
13 return column.replace('/', '_').replace('-', '_').replace(' ', '_')
14
15 # load data from all dates
16 all_data = []
17 for date in all_dates:
18 data = pd.read_csv(get_jhu_cached(date))
19 data.rename(columns=rename_columns, inplace=True)
20 data['Date'] = date
21 all_data.append(data)
22
23 # combine data from all days
24 raw_data = pd.concat(all_data)
1 # remove cruise ships from countries
2 exclude_countries = [
3 'MS Zaandam',
4 'Diamond Princess',
5 'Cruise Ship'
6 ]
7
8 raw_data = raw_data[~raw_data['Country_Region'].isin(exclude_countries)]
9
10 # standardize country names for all entries
11 country_mapping = {
12 'Hong Kong.+': 'Hong Kong',
13 'Iran.+': 'Iran',
14 '.*Congo.*': 'Congo',
15 'Mainland China': 'China',
16 '.*Bahamas.*': 'The Bahamas',
17 '.*Gambia.*': 'The Gambia',
18 'Viet Nam': 'Vietnam',
19 'Taiwan\*': 'Taiwan',
20 'Cote d\'Ivoire': 'Ivory Coast',
21 'Cabo Verde': 'Cape Verde',
22 'Russian Federation': 'Russia',
23 ' Azerbaijan': 'Azerbaijan',
24 'Holy See': 'Vatican City',
25 'Republic of Ireland': 'Ireland',
26 'Republic of Moldova': 'Moldova',
27 'Czechia': 'Czech Republic',
28 'Republic of Korea|Korea, South': 'South Korea',
29 'Timor-Leste': 'East Timor',
30 'Macao SAR|Macau': 'Macao',
31 'UK': 'United Kingdom',
32 'Jersey|Guernsey': 'Channel Islands',
33 'Dominica': 'Dominican Republic'
34 }
35
36 raw_data['Country_Region'] = raw_data['Country_Region'].replace(to_replace=country_mapping.keys(), value=country_mapping.values(), regex=True)
1 # group data by country
2 updates_per_country = raw_data.groupby(['Country_Region', 'Date']).agg(
3 Confirmed = ('Confirmed','sum'),
4 Deaths = ('Deaths','sum'),
5 ).reset_index()
6
7 # get all cantons
8 all_countries = updates_per_country['Country_Region'].unique()
9 all_countries.sort()
10
11 # calculate difference between days
12 for country in all_countries:
13 updates_for_country = updates_per_country.loc[updates_per_country['Country_Region'] == country]
14 previous_index = None
15 total_cases = 0
16 for index in updates_for_country.index:
17 if previous_index != None:
18 new_cases = updates_per_country.at[index, 'Confirmed'] - updates_per_country.at[previous_index, 'Confirmed']
19 updates_per_country.at[index, 'New_Cases'] = new_cases
20 total_cases += new_cases
21 updates_per_country.at[index, 'Total_Cases'] = total_cases
22 updates_per_country.at[index, 'New_Deaths'] = updates_per_country.at[index, 'Deaths'] - updates_per_country.at[previous_index, 'Deaths']
23 previous_index = index
24
25 updates_per_country = updates_per_country[updates_per_country['Date'] >= '2020-01-23']
26
27 worldwide_pretty = updates_per_country.loc[:, ['Date', 'Country_Region', 'New_Cases', 'Total_Cases', 'New_Deaths']]
1 # calculate new cases
2 new_cases_graph = figure(title="New COVID-19 cases per country", y_axis_label='new cases', x_axis_type='datetime', sizing_mode='stretch_width')
3 palette = turbo(all_countries.size)
4 i = 0
5 for country in all_countries:
6 updates_for_country = updates_per_country.loc[updates_per_country['Country_Region'] == country]
7 new_cases_graph.line(updates_for_country['Date'], updates_for_country['New_Cases'], line_color=palette[i], legend_label=country, line_width=2)
8 i += 1
9 show(new_cases_graph)

1 # show total cases
2 total_cases_graph = figure(title="Total COVID-19 cases per country", y_axis_label='total cases', x_axis_type='datetime', sizing_mode='stretch_width')
3 palette = turbo(all_countries.size)
4 i = 0
5 for country in all_countries:
6 updates_for_country = updates_per_country.loc[updates_per_country['Country_Region'] == country]
7 total_cases_graph.line(updates_for_country['Date'], updates_for_country['Total_Cases'], line_color=palette[i], legend_label=country, line_width=2)
8 i += 1
9 show(total_cases_graph)

Swiss COVID-19 Data

1 # get population data from WikiData
2 canton_data = wikidata.query('''
3 SELECT ?shortCode ?population ?canton WHERE {
4 ?canton wdt:P31 wd:Q23058.
5 ?canton wdt:P300 ?shortCode.
6 OPTIONAL {
7 ?canton p:P1082 ?population_stmt.
8 ?population_stmt ps:P1082 ?population.
9 ?population_stmt pq:P585 ?population_date.
10 }
11 FILTER NOT EXISTS {
12 ?canton p:P1082/pq:P585 ?population_date_.
13 FILTER (?population_date_ > ?population_date)
14 }
15 }
16 ORDER BY ?shortCode
17 ''')
18 canton_data.set_index('shortCode', inplace=True)
1 raw_data = pd.read_csv('https://raw.githubusercontent.com/openZH/covid_19/master/COVID19_Fallzahlen_CH_total_v2.csv')
2
3 # convert to date
4 raw_data['date'] = pd.to_datetime(raw_data['date'])
5
6 # remove FL
7 swiss = raw_data[raw_data['abbreviation_canton_and_fl'] != 'FL']
8
9 # only use data after 1st of june
10 swiss = swiss[swiss['date'] >= '2020-05-31']
11
12 # only keep useful entries
13 conf_cases = swiss.loc[-swiss['ncumul_conf'].isna()].copy()
14
15 # get all cantons
16 all_cantons = conf_cases['abbreviation_canton_and_fl'].unique()
17 all_cantons.sort()
18
19 # calculate new cases
20 conf_cases['new_cases'] = 0
21
22 for canton in all_cantons:
23 updates_for_canton = conf_cases.loc[conf_cases['abbreviation_canton_and_fl'] == canton]
24 previous_index = None
25 total_cases = 0
26 for index in updates_for_canton.index:
27 if previous_index != None:
28 new_cases = conf_cases.at[index, 'ncumul_conf'] - conf_cases.at[previous_index, 'ncumul_conf']
29 conf_cases.at[index, 'new_cases'] = new_cases
30 conf_cases.at[index, 'new_cases_relative'] = new_cases / int(canton_data.at['CH-' + canton, 'population']) * 100000
31 total_cases += new_cases
32 conf_cases.at[index, 'total_cases'] = total_cases
33 conf_cases.at[index, 'total_cases_relative'] = total_cases / int(canton_data.at['CH-' + canton, 'population']) * 100000
34 conf_cases.at[index, 'new_deaths'] = conf_cases.at[index, 'ncumul_deceased'] - conf_cases.at[previous_index, 'ncumul_deceased']
35 previous_index = index
36
37 conf_cases = conf_cases[conf_cases['date'] >= '2020-06-01']
38
39 swiss_pretty = conf_cases.loc[:, ['date', 'abbreviation_canton_and_fl', 'new_cases', 'total_cases', 'new_deaths']]

New COVID-19 cases per 100'0000 residents

1 # calculate new cases
2 graphs = []
3 max_new_cases = conf_cases['new_cases_relative'].max()
4 for canton in all_cantons:
5 update_for_canton = conf_cases.loc[conf_cases['abbreviation_canton_and_fl'] == canton].copy()
6
7 update_for_canton['new_cases_relative_avg'] = update_for_canton['new_cases_relative'].rolling(window=7).mean()
8
9 new_cases_graph = figure(title=canton, y_axis_label='new cases', y_range=[0, max_new_cases], x_axis_type='datetime')
10 new_cases_graph.line(update_for_canton['date'], update_for_canton['new_cases_relative'], line_width=1)
11 new_cases_graph.line(update_for_canton['date'], update_for_canton['new_cases_relative_avg'], line_color='red', line_width=1)
12 graphs.append(new_cases_graph)
13
14 show(create_grid(graphs, sizing_mode='scale_width'))

Total COVID-19 cases since 1st of june per 100'000 residents

1 # calculate total cases
2 graphs = []
3 max_total_cases = conf_cases['total_cases_relative'].max()
4 for canton in all_cantons:
5 update_for_canton = conf_cases.loc[conf_cases['abbreviation_canton_and_fl'] == canton]
6
7 total_cases_graph = figure(title=canton, y_axis_label='total cases', y_range=[0, max_total_cases], x_axis_type='datetime')
8 total_cases_graph.line(update_for_canton['date'], update_for_canton['total_cases_relative'], line_width=1)
9 graphs.append(total_cases_graph)
10
11 show(create_grid(graphs, sizing_mode='scale_width'))

Final Data Frames

1 worldwide_pretty

Date Country_Region New_Cases Total_Cases New_Deaths
0 2020-02-24 Afghanistan NaN NaN NaN
1 2020-02-25 Afghanistan 0.0 0.0 0.0
2 2020-02-26 Afghanistan 0.0 0.0 0.0
3 2020-02-27 Afghanistan 0.0 0.0 0.0
4 2020-02-28 Afghanistan 0.0 0.0 0.0
... ... ... ... ... ...
79027 2020-03-12 occupied Palestinian territory 0.0 -25.0 0.0
79028 2020-03-14 occupied Palestinian territory 0.0 -25.0 0.0
79029 2020-03-15 occupied Palestinian territory 0.0 -25.0 0.0
79030 2020-03-16 occupied Palestinian territory 0.0 -25.0 0.0
79031 2020-03-17 occupied Palestinian territory 0.0 -25.0 0.0

79024 rows × 5 columns

1 swiss_pretty

date abbreviation_canton_and_fl new_cases total_cases new_deaths
2346 2020-06-01 BL 1 1.0 0.0
2347 2020-06-01 FR 0 0.0 0.0
2348 2020-06-01 GE 1 1.0 0.0
2349 2020-06-01 GR 0 0.0 0.0
2350 2020-06-01 JU 0 0.0 0.0
... ... ... ... ... ...
10697 2021-05-01 NW 8 2337.0 0.0
10698 2021-05-01 TI 43 28899.0 0.0
10699 2021-05-01 BE 137 60386.0 1.0
10700 2021-05-01 SZ 39 10873.0 0.0
10701 2021-05-01 BS 53 11053.0 0.0

7828 rows × 5 columns

1